Required packages
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
#repos='http://cran.muenster.r-project.org'
stdata = c("sp", "sf", "raster")
Stat_methods = c("glmnet", "ranger", "gbm", "xgboost", "party", "caret", "gstat")
visual = c("RColorBrewer", "ggplot2", "corrplot", "tmap", "leaflet", "mapview","leafem", "pdp", "vip", "DT", "sparkline")
map = c("maptools")
tidy = c("devtools", "dplyr", "tidyr", "reshape2", "knitr")
other = c("countrycode", "htmlwidgets", "data.table", "Matrix", "GGally")
packages <- c(stdata, tidy, Stat_methods, visual, map, other)
ipak(packages)
## sp sf raster devtools dplyr tidyr
## TRUE TRUE TRUE TRUE TRUE TRUE
## reshape2 knitr glmnet ranger gbm xgboost
## TRUE TRUE TRUE TRUE TRUE TRUE
## party caret gstat RColorBrewer ggplot2 corrplot
## TRUE TRUE TRUE TRUE TRUE TRUE
## tmap leaflet mapview leafem pdp vip
## TRUE TRUE TRUE TRUE TRUE TRUE
## DT sparkline maptools countrycode htmlwidgets data.table
## TRUE TRUE TRUE TRUE TRUE TRUE
## Matrix GGally
## TRUE TRUE
APMtools is an R package providing datasets for global air pollution modelling and tools to streamline and facilitate air pollution mapping using statistical methods. Please have a read of it on Github.
install_github("mengluchu/APMtools")
library(APMtools)
ls("package:APMtools")
## [1] "Brt_imp" "Brt_LUR" "Brt_pre"
## [4] "cforest_LUR" "countrywithppm" "create_ring"
## [7] "ctree_LUR" "DENL_new" "error_matrix"
## [10] "global_annual" "join_by_id" "Lasso"
## [13] "Lasso_pre" "Lassoselected" "mechanical"
## [16] "merge_roads" "merged" "merged_nightlight"
## [19] "mergeraster2file" "plot_error" "plot_rsq"
## [22] "ppm2ug" "predicLA_RF_XGBtiles" "RDring_coef"
## [25] "removedips" "retrieve_predictor" "rf_imp"
## [28] "rf_LUR" "rf_pre" "sampledf"
## [31] "scatterplot" "subset_grep" "univar_rsq"
## [34] "xgb_pre" "xgboost_imp" "xgboost_LUR"
Load data:
#gd = fread("~/Documents/GitHub/Global mapping/2020_06_world/stations_20200602.csv")
#avg = fread("~/Documents/GitHub/Global mapping/oaqEUAUCAUS.csv")
#gdata = merge(gd, avg, by.x = c("long", "lat"), by.y = c("LONGITUDE","LATITUDE" ))
#g1 = na_if(gdata, -9999.9)
#g2 = g1%>%dplyr::select(-id, -dir, -V1)%>%filter(value_mean >0)
data("global_annual")
Predictor variables:
vastring = "road|nightlight|population|temp|wind|trop|indu|elev"
Dataset:
1. road_class_XX_size: road lenght within a buffer with radius “size” of type XX. ROAD_1: highway, ROAD_2: primary, ROAD_3: secondary, ROAD_4: tertiary, ROAD_5: unpaved
2. industry_size: Industrial area within a buffer with radius “size”.
3. trop_mean: TROPOMI averaged over Feb 2018 - Jan 2019.
4. temperature_2m_m: monthly mean temperature at 2m height of month “m”.
5. wind_speed_10m_m:monthly mean wind speed at 2m height of month “m”.
6. poppulation_1000/ 3000 /5000: population 1, 3, 5 km resolution.
7. Rsp: Surface remote sensing and chemical transport model product (only for 2012).
8. OMI_mean_filt: OMI column density, 2017 annual average.
9. nightlight_size: nightlight VIIRS data in original resolution (500 m) and various buffer sizes.
NO2 annual concentration, all the units are converted to ug/m3 (micro per cubic meter). In the data display later you can see countries with different units.
global_annual %>% dplyr::select(value_mean ) %>% summary()
## value_mean
## Min. : 0.00022
## 1st Qu.:13.85912
## Median :22.48644
## Mean :24.37987
## 3rd Qu.:32.65573
## Max. :98.34452
#datatable(g2, rownames = FALSE, filter = "top", options = list(pageLength = 5, scrollX = T))
Merge roads of different road types, here 3, 4, 5, the road length of these road types are aggregated. The original road types are substituted (with keep = T, they are remained).
merged_mr = merge_roads(global_annual, c(3, 4, 5), keep = F) # keep = T keeps the original roads.
names(merged_mr)
## [1] "road_class_M345_1000" "road_class_M345_100" "road_class_M345_25"
## [4] "road_class_M345_3000" "road_class_M345_300" "road_class_M345_5000"
## [7] "road_class_M345_500" "road_class_M345_50" "road_class_M345_800"
## [10] "long" "lat" "nightlight_800"
## [13] "nightlight_500" "nightlight_5000" "nightlight_3000"
## [16] "nightlight_1000" "elevation" "industry_1000"
## [19] "industry_100" "industry_25" "industry_3000"
## [22] "industry_300" "industry_5000" "industry_500"
## [25] "industry_50" "industry_800" "OMI_mean_filt"
## [28] "population_1000" "population_3000" "population_5000"
## [31] "road_class_1_1000" "road_class_1_100" "road_class_1_25"
## [34] "road_class_1_3000" "road_class_1_300" "road_class_1_5000"
## [37] "road_class_1_500" "road_class_1_50" "road_class_1_800"
## [40] "road_class_2_1000" "road_class_2_100" "road_class_2_25"
## [43] "road_class_2_3000" "road_class_2_300" "road_class_2_5000"
## [46] "road_class_2_500" "road_class_2_50" "road_class_2_800"
## [49] "Rsp" "temperature_2m_10" "temperature_2m_11"
## [52] "temperature_2m_12" "temperature_2m_1" "temperature_2m_2"
## [55] "temperature_2m_3" "temperature_2m_4" "temperature_2m_5"
## [58] "temperature_2m_6" "temperature_2m_7" "temperature_2m_8"
## [61] "temperature_2m_9" "trop_mean_filt" "wind_speed_10m_10"
## [64] "wind_speed_10m_11" "wind_speed_10m_12" "wind_speed_10m_1"
## [67] "wind_speed_10m_2" "wind_speed_10m_3" "wind_speed_10m_4"
## [70] "wind_speed_10m_5" "wind_speed_10m_6" "wind_speed_10m_7"
## [73] "wind_speed_10m_8" "wind_speed_10m_9" "value_mean"
## [76] "country"
Visualize with tmap: convenient
locations_sf = st_as_sf(merged_mr, coords = c("long","lat"))
osm_valuemean = tm_shape(locations_sf) +
tm_dots( "value_mean", col = "value_mean", size = 0.05,title = "NO2 value",
popup.vars = c("value_mean" )) + tm_view(basemaps = c('OpenStreetMap'))
#+tm_shape(lnd)+tm_lines()
tmap_save(osm_valuemean, "NO2mean.html")
climatemissing= merged_mr%>%filter(is.na(wind_speed_10m_10))
locations_sf = st_as_sf(climatemissing, coords = c("long","lat"))
osm_valuemean = tm_shape(locations_sf) +
tm_dots( "value_mean", col = "value_mean", size = 0.05,title = "NO2 value",
popup.vars = c("value_mean" )) + tm_view(basemaps = c('OpenStreetMap'))
#+tm_shape(lnd)+tm_lines()
tmap_save(osm_valuemean, "climatemissing.html")
Visualize with leaflet: more control. show Day/night ratio, red: day/night >1, blue, day/nigh <1
merged_fp = merged_mr %>% mutate(ratiodn = day_value/night_value) %>% mutate(color = ifelse(ratiodn > 1, "red", "blue"))
m = leaflet(merged_fp) %>%
addTiles() %>% addCircleMarkers(radius = ~value_mean/5, color = ~color, popup = ~as.character(value_mean),fill = FALSE) %>% addProviderTiles(providers$Esri.NatGeoWorldMap) %>% addMouseCoordinates() %>%
addHomeButton(ext = extent(116.2, 117, 39.7, 40), layer.name = "Beijing") %>% addHomeButton(ext = extent(5, 5.2, 52, 52.2), layer.name = "Utrecht")
saveWidget(m, file = "NO2daynight.html")
Boxplot
countryname = paste(merged_mr$country, countrycode(merged_mr$country, 'iso2c', 'country.name'), sep = ":")
#tag country with ppm
# use the median for colour
mergedmedian = merged_mr %>% group_by(country) %>% mutate(median = median(value_mean, na.rm = TRUE))
nrow(merged_mr)
## [1] 5521
merged_mr = merged_mr %>% group_by(country) %>% mutate(count1 = n())
merged_mr$count1
## [1] 456 456 456 190 190 190 190 190 190 190 190 456 456 190
## [15] 190 190 190 190 190 190 190 190 190 190 190 190 190 190
## [29] 190 190 190 190 190 456 190 456 190 190 190 456 190 190
## [43] 456 456 190 456 456 456 190 456 190 190 456 456 456 456
## [57] 456 456 456 456 190 456 190 456 456 456 456 456 456 190
## [71] 456 456 456 456 456 456 190 456 456 456 456 456 456 456
## [85] 456 190 456 456 456 456 456 456 456 456 190 456 456 456
## [99] 456 456 456 456 456 456 456 456 456 456 456 456 456 190
## [113] 456 190 456 456 190 456 456 456 456 190 456 456 456 456
## [127] 456 456 456 456 456 456 456 456 456 456 456 456 456 456
## [141] 456 190 456 456 190 456 456 456 456 456 456 456 456 456
## [155] 456 190 456 456 456 456 456 456 456 456 456 456 190 456
## [169] 190 456 456 456 456 456 190 456 456 456 190 190 190 190
## [183] 190 190 190 190 190 190 190 190 190 190 190 190 190 190
## [197] 190 190 456 190 190 190 190 190 190 190 190 190 456 456
## [211] 456 456 456 456 456 456 456 190 456 456 456 456 190 190
## [225] 190 190 190 190 190 456 456 190 190 456 456 456 190 190
## [239] 456 456 456 456 456 456 456 456 456 456 456 456 456 456
## [253] 456 456 456 190 456 456 456 456 456 456 456 190 456 456
## [267] 456 456 456 456 456 456 456 190 456 456 456 456 456 456
## [281] 456 456 456 456 456 456 456 190 456 456 456 456 456 456
## [295] 456 456 456 456 456 456 456 456 456 190 456 456 456 456
## [309] 456 190 456 456 456 456 456 456 456 456 456 6 6 6
## [323] 6 6 6 456 456 456 456 456 456 456 456 456 456 456
## [337] 456 456 456 456 456 456 456 456 456 456 456 456 456 456
## [351] 456 456 456 456 456 456 456 456 456 456 456 456 456 456
## [365] 456 456 456 456 456 456 456 456 456 456 456 456 456 456
## [379] 456 456 456 456 456 456 456 456 456 456 456 456 456 456
## [393] 456 456 456 456 456 456 456 456 456 456 456 456 456 456
## [407] 456 456 456 456 456 190 456 456 456 456 456 456 456 456
## [421] 456 456 456 456 456 456 456 456 456 456 456 456 456 456
## [435] 456 456 456 456 456 456 190 456 456 456 456 456 456 456
## [449] 456 190 456 190 456 456 456 456 456 456 456 456 190 456
## [463] 190 190 190 456 456 456 190 456 456 456 190 190 190 456
## [477] 456 456 456 190 456 190 456 190 190 456 456 456 456 456
## [491] 456 456 456 190 456 190 456 456 456 190 190 190 190 190
## [505] 456 190 190 190 456 190 190 190 190 190 190 190 190 456
## [519] 190 190 456 456 190 456 456 456 456 456 190 456 456 456
## [533] 456 456 456 190 456 456 456 456 456 456 456 456 456 456
## [547] 456 456 456 456 190 456 456 456 456 190 190 190 456 456
## [561] 456 456 456 456 456 456 456 456 190 456 456 456 190 456
## [575] 456 456 456 456 190 456 456 456 456 190 190 190 190 190
## [589] 190 190 190 190 190 190 190 190 190 456 456 9 456 456
## [603] 456 456 456 456 456 456 456 456 456 456 456 190 190 456
## [617] 456 456 456 456 9 456 9 9 9 9 9 9 9 456
## [631] 456 190 190 456 190 190 190 456 190 190 456 190 190 190
## [645] 190 190 190 190 190 190 190 190 190 190 445 445 445 190
## [659] 445 445 445 445 445 445 190 190 190 190 190 190 445 445
## [673] 445 15 15 15 15 15 15 15 15 15 15 15 15 15
## [687] 15 15 53 12 12 12 12 12 12 12 12 12 12 506
## [701] 506 506 12 12 506 506 53 53 506 506 506 506 506 506
## [715] 506 506 506 506 506 506 506 506 506 506 506 506 506 506
## [729] 506 506 506 506 506 506 506 506 506 506 506 506 506 506
## [743] 506 506 506 506 506 506 506 506 53 53 53 14 53 53
## [757] 53 53 506 53 53 53 53 53 506 53 53 53 53 53
## [771] 53 53 506 53 53 53 53 53 506 506 506 53 53 506
## [785] 53 506 53 53 53 53 53 506 53 53 53 53 506 506
## [799] 53 506 506 506 506 506 14 506 506 53 53 506 506 506
## [813] 53 506 53 53 506 53 506 53 506 53 506 506 53 506
## [827] 53 506 506 506 506 53 506 506 506 53 506 506 506 506
## [841] 506 152 53 14 14 14 506 506 506 506 506 506 506 506
## [855] 14 506 506 506 506 152 506 506 506 506 506 506 14 506
## [869] 14 506 14 14 152 506 14 152 506 506 14 14 506 506
## [883] 14 506 506 506 506 506 506 506 506 506 152 506 506 506
## [897] 506 506 152 506 506 506 506 506 506 506 506 506 506 506
## [911] 506 506 506 506 506 506 506 506 506 506 506 506 506 506
## [925] 506 506 506 506 506 506 506 506 506 506 506 506 506 506
## [939] 506 506 506 506 506 506 506 3 3 3 506 506 152 506
## [953] 506 506 506 506 506 506 506 506 506 506 506 506 506 506
## [967] 506 152 506 506 506 506 152 506 152 506 445 445 506 445
## [981] 506 506 506 506 506 506 152 152 506 152 152 152 152 506
## [995] 152 506 506 506 506 445 506 506 506 506 506 506 506 152
## [1009] 506 506 506 506 506 506 506 506 506 506 506 506 506 506
## [1023] 506 152 506 506 506 506 506 152 506 506 152 506 506 152
## [1037] 506 506 506 506 506 506 506 506 506 506 506 506 506 506
## [1051] 506 506 506 506 506 152 506 506 506 506 506 506 506 506
## [1065] 506 506 152 506 506 506 506 506 506 445 506 445 506 506
## [1079] 506 506 506 152 152 152 152 152 152 506 506 506 152 506
## [1093] 506 152 152 152 506 152 152 152 506 152 506 506 152 506
## [1107] 152 506 506 506 506 506 506 152 506 445 445 445 152 152
## [1121] 152 506 152 506 152 152 506 506 152 506 506 152 152 152
## [1135] 506 152 506 506 152 506 152 506 506 506 152 152 152 506
## [1149] 445 152 506 152 506 445 506 445 152 506 506 506 445 506
## [1163] 152 152 152 152 152 445 506 506 506 445 506 152 506 152
## [1177] 445 506 506 152 506 506 506 445 506 506 506 152 506 506
## [1191] 506 445 152 152 506 445 506 152 506 152 152 506 506 445
## [1205] 445 445 506 506 506 506 506 445 152 152 445 445 152 445
## [1219] 152 445 445 152 152 445 152 445 152 152 445 152 152 152
## [1233] 152 152 152 152 445 152 152 152 152 152 152 506 506 152
## [1247] 152 152 506 506 152 152 445 152 445 152 445 506 506 152
## [1261] 152 152 445 506 506 152 445 152 152 506 506 445 506 152
## [1275] 152 445 506 152 506 506 506 506 445 506 152 506 506 506
## [1289] 506 506 152 506 506 152 445 445 506 506 506 506 445 152
## [1303] 506 445 445 506 445 445 445 445 445 445 445 152 445 445
## [1317] 506 506 445 506 506 506 506 445 152 152 506 445 506 152
## [1331] 506 445 152 506 506 506 506 506 445 445 445 152 506 506
## [1345] 506 445 445 445 506 152 506 506 445 445 152 152 506 506
## [1359] 152 506 506 506 506 506 152 152 506 506 152 445 506 152
## [1373] 152 506 152 506 445 152 152 152 506 506 152 506 152 506
## [1387] 506 445 152 445 506 506 506 445 152 506 506 445 445 445
## [1401] 445 152 445 445 445 445 445 445 152 506 152 506 445 445
## [1415] 445 445 152 152 152 445 445 445 506 506 152 445 506 152
## [1429] 445 506 506 445 152 445 152 445 445 506 445 506 506 445
## [1443] 506 445 506 506 506 445 506 152 445 445 445 445 445 152
## [1457] 445 506 445 506 506 506 506 445 506 445 506 506 506 445
## [1471] 152 445 506 445 506 445 506 506 445 445 445 445 445 1
## [1485] 445 506 506 506 506 506 445 445 506 445 445 445 506 506
## [1499] 445 506 506 445 445 445 445 506 445 445 506 506 506 506
## [1513] 445 506 506 506 506 506 445 506 445 506 506 506 506 506
## [1527] 506 445 506 445 506 506 506 506 506 506 506 506 506 506
## [1541] 506 445 445 506 445 445 445 445 445 445 445 445 506 506
## [1555] 445 445 445 445 445 445 445 445 445 445 445 445 445 506
## [1569] 445 445 445 445 445 445 445 445 445 445 445 445 445 445
## [1583] 445 506 445 445 445 445 445 506 445 445 445 82 445 445
## [1597] 445 506 506 445 445 445 445 506 445 445 506 506 445 445
## [1611] 506 445 445 445 445 445 506 506 445 445 506 445 445 445
## [1625] 445 445 445 445 445 445 445 506 82 445 445 82 445 506
## [1639] 445 82 445 445 445 445 506 82 445 82 445 445 82 445
## [1653] 445 445 445 445 445 445 82 82 82 82 82 82 73 82
## [1667] 82 82 82 506 445 82 445 445 445 445 73 82 82 445
## [1681] 445 445 445 445 445 445 445 445 445 445 445 73 82 73
## [1695] 82 82 506 506 82 445 82 82 73 73 506 73 82 82
## [1709] 82 82 82 82 73 82 445 73 73 445 82 445 82 445
## [1723] 82 82 82 82 82 445 445 82 82 82 73 82 445 445
## [1737] 82 82 82 82 82 82 445 73 82 82 82 82 82 73
## [1751] 82 82 73 445 82 73 73 73 82 73 82 73 445 73
## [1765] 82 445 73 73 445 445 445 73 445 82 73 73 445 445
## [1779] 445 445 73 445 73 73 445 73 73 73 445 445 445 445
## [1793] 73 445 445 445 73 445 73 445 82 73 445 445 73 445
## [1807] 445 445 445 445 73 445 73 73 445 445 73 445 445 445
## [1821] 445 73 73 73 445 73 445 445 445 445 445 82 73 73
## [1835] 445 82 445 82 445 445 73 445 445 445 445 445 82 445
## [1849] 73 73 73 82 73 445 82 445 73 82 445 445 82 73
## [1863] 44 445 445 445 44 44 44 44 44 445 82 445 445 73
## [1877] 445 82 445 445 445 73 445 445 445 445 73 445 82 445
## [1891] 82 445 73 445 82 73 82 82 82 82 445 445 445 73
## [1905] 445 44 445 44 445 445 445 445 445 7 73 73 73 445
## [1919] 445 73 445 445 445 445 445 73 7 73 82 82 35 445
## [1933] 73 445 73 445 445 35 445 445 411 411 445 445 7 445
## [1947] 7 445 445 7 35 445 44 445 445 445 445 7 445 445
## [1961] 445 445 411 445 445 35 445 445 445 445 445 73 411 445
## [1975] 7 445 411 445 445 73 411 411 445 411 411 445 411 73
## [1989] 411 445 73 73 35 411 35 411 35 411 411 411 445 445
## [2003] 411 445 411 411 411 411 411 445 411 411 445 411 577 445
## [2017] 445 445 411 411 411 445 411 411 411 445 445 73 35 411
## [2031] 445 411 445 411 411 577 35 411 411 411 411 411 411 411
## [2045] 411 445 411 445 411 411 577 411 411 411 411 411 445 411
## [2059] 411 35 411 411 411 411 411 411 445 577 445 411 577 445
## [2073] 445 411 445 577 577 577 445 411 445 445 577 35 411 411
## [2087] 35 411 411 35 411 411 577 445 577 577 411 445 411 35
## [2101] 411 35 411 35 411 411 411 411 577 411 577 577 577 577
## [2115] 411 577 445 577 445 411 445 411 445 577 411 577 445 411
## [2129] 577 411 411 577 411 411 577 411 411 411 44 44 577 411
## [2143] 577 411 577 411 411 411 411 445 577 577 35 577 411 577
## [2157] 577 577 577 411 411 411 411 411 411 411 577 411 411 411
## [2171] 411 577 577 577 577 577 577 35 35 411 577 411 411 411
## [2185] 35 411 577 411 411 411 577 577 411 577 577 577 411 411
## [2199] 577 411 411 411 577 35 411 411 411 577 577 411 411 35
## [2213] 35 411 35 35 411 411 577 411 577 35 411 35 577 411
## [2227] 411 577 577 35 577 577 577 411 411 411 411 411 411 411
## [2241] 411 411 411 411 35 411 445 445 577 445 411 577 577 445
## [2255] 411 411 577 411 577 411 577 411 411 445 577 411 577 577
## [2269] 577 577 577 577 577 411 35 577 577 577 411 411 577 411
## [2283] 35 577 577 577 577 35 35 577 577 577 411 577 577 577
## [2297] 577 411 577 411 577 411 577 411 411 577 577 411 577 577
## [2311] 577 411 577 577 577 411 411 411 411 577 411 577 445 411
## [2325] 411 577 577 577 411 411 411 577 577 411 411 411 577 411
## [2339] 577 577 411 577 577 577 577 577 577 577 411 577 35 577
## [2353] 35 577 411 577 577 445 411 445 445 445 411 577 445 411
## [2367] 577 411 44 411 577 577 577 577 411 411 577 577 577 577
## [2381] 577 149 577 577 44 411 44 149 44 577 149 44 149 411
## [2395] 577 577 577 411 411 411 411 577 577 44 577 577 411 577
## [2409] 411 411 149 411 411 577 411 411 577 577 149 577 411 577
## [2423] 577 577 411 411 577 577 577 577 577 577 577 149 411 8
## [2437] 411 577 411 411 411 411 411 577 411 411 411 411 411 411
## [2451] 411 411 577 411 577 577 411 149 577 577 411 411 411 577
## [2465] 411 411 411 411 577 411 577 8 8 577 44 577 577 577
## [2479] 411 411 411 577 577 577 411 577 411 577 411 577 577 577
## [2493] 577 577 44 577 411 577 8 44 44 577 411 577 44 411
## [2507] 577 411 411 577 411 411 44 44 411 577 411 577 577 577
## [2521] 577 577 411 577 411 577 44 411 44 411 411 577 44 577
## [2535] 577 411 411 44 411 577 411 411 44 44 44 44 577 149
## [2549] 149 411 44 411 577 411 577 411 411 577 411 411 577 44
## [2563] 577 44 411 577 44 44 411 577 577 411 411 411 411 411
## [2577] 577 577 577 577 44 44 577 411 577 577 411 411 411 577
## [2591] 411 577 411 577 411 411 411 411 577 44 411 44 577 577
## [2605] 44 577 577 577 577 577 577 411 577 577 411 411 577 577
## [2619] 577 411 577 577 577 577 577 577 577 577 411 577 577 411
## [2633] 577 577 577 577 577 411 577 411 149 577 149 149 411 577
## [2647] 149 411 577 411 411 577 411 577 149 577 411 577 577 411
## [2661] 44 411 577 411 411 411 577 411 411 577 577 577 577 411
## [2675] 577 149 149 577 577 411 411 577 411 577 577 577 577 577
## [2689] 577 577 577 577 411 411 577 577 577 411 577 577 411 411
## [2703] 577 577 577 577 149 577 577 577 411 149 50 411 577 50
## [2717] 411 411 577 411 50 50 411 577 577 577 411 149 411 411
## [2731] 411 411 411 577 50 411 577 411 411 411 411 149 411 411
## [2745] 577 411 411 577 577 411 577 577 577 577 577 577 577 577
## [2759] 577 411 577 577 577 411 411 577 577 577 411 577 577 577
## [2773] 577 411 577 577 577 577 577 411 577 577 8 411 577 411
## [2787] 577 577 577 411 411 577 577 577 577 577 577 411 577 411
## [2801] 577 577 411 411 577 411 8 577 577 411 8 577 8 577
## [2815] 577 577 50 411 577 577 68 577 577 577 577 577 577 577
## [2829] 577 577 411 68 50 50 577 50 411 577 577 149 149 411
## [2843] 149 411 50 411 50 577 50 411 577 411 577 577 577 577
## [2857] 577 149 411 577 149 411 50 577 50 50 50 411 577 577
## [2871] 149 577 149 411 411 149 149 149 411 411 411 149 149 149
## [2885] 577 411 577 50 50 149 50 149 411 577 577 577 577 411
## [2899] 577 411 577 577 411 577 68 577 577 577 411 577 577 577
## [2913] 577 411 411 411 411 577 577 577 411 68 68 68 577 577
## [2927] 411 577 68 411 411 411 577 68 411 577 411 411 411 411
## [2941] 411 577 411 577 577 149 50 577 149 577 577 411 577 411
## [2955] 577 577 577 411 577 149 68 577 68 577 149 68 411 9
## [2969] 149 149 68 577 577 411 577 577 9 577 411 411 411 577
## [2983] 411 577 577 149 13 411 577 149 411 577 68 68 68 411
## [2997] 577 577 149 68 68 411 68 577 68 577 577 68 577 577
## [3011] 577 577 50 68 4 577 577 577 577 577 577 68 577 577
## [3025] 577 149 149 577 577 411 577 149 149 577 577 577 577 149
## [3039] 577 411 577 149 149 577 149 149 149 149 411 577 411 577
## [3053] 577 577 577 577 577 149 577 411 577 68 138 577 577 68
## [3067] 577 577 68 68 4 149 577 149 68 68 149 411 68 68
## [3081] 68 577 13 4 68 9 68 149 577 411 149 68 411 4
## [3095] 138 149 411 577 138 577 68 149 577 577 577 411 149 577
## [3109] 149 577 577 577 577 577 577 411 577 149 577 149 9 149
## [3123] 149 68 138 411 577 577 577 9 138 577 149 9 149 68
## [3137] 149 68 577 149 149 577 577 149 138 138 577 149 68 577
## [3151] 149 138 577 149 577 138 577 149 9 577 577 577 577 149
## [3165] 577 138 149 149 149 149 577 149 149 149 138 149 149 577
## [3179] 149 149 138 577 149 149 577 577 13 138 68 149 149 149
## [3193] 50 50 50 149 577 577 577 9 13 149 577 68 149 138
## [3207] 577 68 149 577 577 149 577 577 149 577 577 149 149 149
## [3221] 577 577 149 149 13 13 68 149 13 577 149 577 577 149
## [3235] 149 138 138 149 9 577 149 149 577 577 138 577 577 149
## [3249] 577 138 577 577 577 149 149 149 149 149 149 149 138 68
## [3263] 149 149 149 149 149 149 149 149 13 149 149 577 149 149
## [3277] 13 13 149 149 149 577 149 149 149 149 577 577 577 50
## [3291] 577 23 577 68 577 577 68 149 138 138 68 577 149 68
## [3305] 138 9 68 149 138 577 149 577 577 138 23 577 577 149
## [3319] 577 577 138 577 577 149 138 138 25 138 138 149 138 577
## [3333] 577 25 577 25 25 50 138 50 577 68 577 577 577 577
## [3347] 68 577 577 9 577 577 577 138 50 68 577 577 68 577
## [3361] 68 23 25 577 9 50 50 23 50 50 23 68 138 138
## [3375] 577 50 577 25 23 68 50 577 577 577 138 577 577 577
## [3389] 138 577 138 13 138 50 50 577 577 25 577 138 50 50
## [3403] 577 50 50 50 50 50 25 68 577 138 13 577 25 68
## [3417] 577 577 577 138 23 23 138 138 68 138 68 68 9 68
## [3431] 9 68 23 68 9 138 138 138 138 138 9 138 68 68
## [3445] 138 138 138 68 138 138 25 50 138 138 68 138 138 9
## [3459] 138 13 138 50 23 138 25 138 138 138 23 25 23 3
## [3473] 23 44 138 9 138 138 138 23 138 138 138 23 25 138
## [3487] 138 23 25 23 138 138 138 138 3 25 3 138 138 138
## [3501] 138 138 5 138 138 138 138 5 138 5 138 25 138 138
## [3515] 138 138 138 5 138 138 138 138 138 138 5 23 25 25
## [3529] 50 138 25 5 138 5 138 138 138 23 50 23 138 133
## [3543] 5 23 138 138 25 5 138 11 50 11 25 138 138 4
## [3557] 138 138 138 138 1 138 138 15 138 133 15 133 133 138
## [3571] 133 25 25 138 138 133 133 11 11 32 11 11 138 138
## [3585] 11 138 4 32 23 11 23 11 23 23 11 32 9 133
## [3599] 15 133 5 25 133 133 138 138 138 138 32 138 50 50
## [3613] 25 32 138 15 133 138 133 11 133 138 133 32 138 23
## [3627] 133 23 23 133 133 23 133 23 133 138 23 26 133 26
## [3641] 23 138 138 133 133 138 26 26 133 138 26 15 26 133
## [3655] 26 26 23 133 133 26 133 133 133 133 133 133 133 133
## [3669] 23 23 23 23 23 133 23 23 23 133 32 23 23 133
## [3683] 23 133 32 133 23 15 133 23 15 15 133 32 133 4
## [3697] 133 133 32 15 15 133 133 133 133 15 133 133 133 133
## [3711] 26 9 32 9 26 9 133 26 26 32 133 133 133 32
## [3725] 32 32 133 133 32 133 32 133 15 15 15 15 133 133
## [3739] 133 32 32 26 133 133 133 133 133 26 32 32 26 32
## [3753] 32 133 133 26 9 133 133 26 133 133 133 133 133 133
## [3767] 133 133 133 133 133 133 133 133 133 133 133 133 133 133
## [3781] 133 102 102 133 133 9 9 102 133 133 102 133 133 133
## [3795] 26 133 102 102 102 133 102 9 133 4 102 133 26 133
## [3809] 26 133 102 133 133 26 133 133 102 32 32 32 32 133
## [3823] 26 133 133 102 133 26 26 133 26 133 133 133 102 102
## [3837] 133 133 133 133 102 9 32 102 133 133 133 133 133 133
## [3851] 133 133 102 32 133 102 133 102 32 102 102 102 32 102
## [3865] 102 102 102 102 102 102 102 102 102 102 102 102 102 102
## [3879] 102 32 102 102 102 102 102 102 102 102 102 102 102 102
## [3893] 102 102 102 102 102 102 102 102 102 102 102 102 102 3
## [3907] 3 3 88 88 88 88 88 88 88 88 88 88 88 88
## [3921] 88 88 88 102 88 88 88 88 88 88 88 88 88 88
## [3935] 88 88 88 88 88 88 88 88 88 88 88 88 88 88
## [3949] 88 88 88 88 88 88 88 88 88 102 88 88 88 88
## [3963] 102 88 88 88 88 88 88 88 102 88 88 88 88 88
## [3977] 88 88 88 88 88 88 88 88 88 88 88 88 88 88
## [3991] 88 88 88 88 88 88 88 88 88 88 102 102 102 102
## [4005] 102 102 102 102 102 102 102 102 102 102 102 102 102 102
## [4019] 102 14 14 102 14 14 14 102 14 14 14 14 14 14
## [4033] 14 14 14 102 102 102 102 102 102 102 102 102 102 102
## [4047] 102 102 102 445 445 445 445 445 445 445 445 445 445 445
## [4061] 445 17 1296 1296 1296 1296 17 17 17 17 17 17 17 17
## [4075] 17 17 17 17 17 17 1296 1296 1296 1296 1296 1296 1296 1296
## [4089] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4103] 1296 1296 1296 1296 17 17 1296 1296 1296 1296 1296 1296 1296 1296
## [4117] 1296 1296 1296 1296 1296 1296 1296 1296 16 16 16 16 16 16
## [4131] 16 16 16 16 16 16 1296 16 16 1296 1296 16 1296 1296
## [4145] 16 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4159] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4173] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4187] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4201] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4215] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4229] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4243] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4257] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4271] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4285] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4299] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4313] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4327] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4341] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4355] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4369] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4383] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4397] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4411] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4425] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4439] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4453] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4467] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4481] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4495] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4509] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4523] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4537] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4551] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4565] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4579] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4593] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4607] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4621] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4635] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4649] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4663] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4677] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4691] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4705] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4719] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4733] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4747] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4761] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4775] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4789] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4803] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4817] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4831] 1296 70 1296 70 70 70 1296 1296 70 1296 1296 1296 1296 1296
## [4845] 1296 1296 1296 1296 1296 1296 70 1296 1296 1296 1296 1296 1296 1296
## [4859] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4873] 1296 70 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4887] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4901] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4915] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4929] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4943] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4957] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4971] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4985] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [4999] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5013] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5027] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5041] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5055] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5069] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5083] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5097] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5111] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5125] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5139] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5153] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5167] 1296 1296 1296 1296 1296 1296 1296 1296 61 1296 1296 61 1296 1296
## [5181] 1296 1296 1296 1296 1296 1296 1296 61 61 1296 61 1296 1296 61
## [5195] 61 61 61 61 61 61 61 1296 1296 1296 1296 61 1296 1296
## [5209] 1296 1296 61 61 1296 1296 61 61 1296 1296 1296 61 1296 61
## [5223] 1296 1296 61 1296 61 1296 61 1296 1296 61 61 1296 1296 1296
## [5237] 1296 1296 1296 61 1296 1296 1296 1296 1296 61 1296 1296 1296 1296
## [5251] 61 61 61 1296 1296 1296 61 1296 1296 61 1296 1296 1296 1296
## [5265] 61 1296 1296 1296 1296 1296 61 1296 1296 1296 61 1296 1296 61
## [5279] 1296 61 1296 1296 1296 1296 1296 1296 1296 1296 61 61 61 61
## [5293] 1296 1296 1296 1296 1296 61 61 1296 1296 1296 1296 1296 61 1296
## [5307] 61 61 61 1296 1296 61 61 61 61 61 61 1296 61 61
## [5321] 1296 1296 61 1296 1296 1296 1296 1296 61 1296 1296 1296 1296 1296
## [5335] 1296 1296 61 1296 1296 1296 61 1296 1296 1296 61 1296 61 1296
## [5349] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5363] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5377] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5391] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5405] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5419] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5433] 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296 1296
## [5447] 1296 1296 1296 1296 1296 1296 1296 1296 70 70 1296 70 1296 1296
## [5461] 1296 70 70 70 70 70 70 70 70 70 70 70 70 70
## [5475] 70 70 70 70 70 70 70 70 70 70 70 70 70 70
## [5489] 70 70 70 70 70 70 70 70 70 70 70 70 70 70
## [5503] 70 70 70 70 70 70 70 70 70 70 70 70 70 70
## [5517] 70 70 70 70 70
countryname_s_e=ifelse( merged_mr$country %in% countrywithppm[countrywithppm %in% merged_mr$country], paste(countryname, "*", sep = ""), countryname)
mergedmedian$countryfullname = paste0(countryname_s_e," (",merged_mr$count1 ,")")
bp2 <- ggplot(mergedmedian, aes(x=countryfullname, y=value_mean, group=country)) +
labs(x = "Country", y = expression(paste("NO"[2], " ", mu, "g/", "m"^3)), cex = 1.5) +
geom_boxplot(aes(fill = median)) +
theme(text = element_text(size = 13), axis.text.x = element_text(angle = 90, hjust = 1)) + scale_fill_distiller(palette = "Spectral")
# scale_color_brewer(direction = 1)
print(bp2 + ylim(0, 100))
#ggsave("boxplot.png")
Plot the paired correlation, for road predictors, population, Tropomi. For DE, CN, and world
merged_mr %>% na.omit %>% filter(country == "DE") %>% ungroup() %>%dplyr::select(matches("value_mean|road_|night|trop")) %>% cor %>% corrplot(type = "upper", method = "pie", tl.cex = 0.7)
merged_mr %>% na.omit %>% filter(country == "US") %>% ungroup() %>%dplyr::select(matches("value_mean|road_|night|trop")) %>% cor %>% corrplot(type = "upper", method = "pie", tl.cex = 0.7)
merged_mr %>% na.omit %>% filter(country == "CA") %>% ungroup() %>%dplyr::select(matches("value_mean|road_|night|trop")) %>% cor %>% corrplot(type = "upper", method = "pie", tl.cex = 0.7)
merged_mr %>% na.omit %>% filter(country == "CN") %>% ungroup() %>%dplyr::select(matches("value_mean|road_|night|trop")) %>% cor %>% corrplot(type = "upper", method = "pie", tl.cex = 0.7)
merged_mr %>% na.omit %>% ungroup() %>%dplyr::select(matches("value_mean|road_|night|trop")) %>% cor %>% corrplot(type = "upper", method = "pie", tl.cex = 0.7)
Spatial dependency
grd_sp <- as_Spatial(locations_sf)
dt.vgm = variogram(value_mean~1, grd_sp, cutoff= 1)
plot(dt.vgm)
#Moran I test
#install.packages("ape", dependencies = TRUE)
#library(ape)
#merged_mrf = merged_mr%>%filter(country == "US")
#no2.dists <- as.matrix(dist(cbind(merged_mrf$LONGITUDE, merged_mrf$LATITUDE)))
#no2.dists[1:5, 1:5]
#no2.inv <- 1/no2.dists
#diag(no2.inv) <- 0
#no2.inv[1:5, 1:5]
#Moran.I(merged_mrf$value_mean, na.rm = T, no2.inv)
countryvariogram = function(COUN, cutoff){
loca = locations_sf%>%filter(country == COUN)
grd_sp <- as_Spatial(loca)
dt.vgm = variogram(value_mean~1, grd_sp, cutoff = cutoff)
plot(dt.vgm)
}
countryvariogram("DE", 1)
countryvariogram("US", 1)
countryvariogram("CN", 1) # reason?
For the illustration purpose, we simply remove missing data, there are not many, 41. In practice, careful handling is needed to choose between removing missing data, imputation or handle them explicitly in the model.
inde_var = merged_mr %>%na.omit() # 70 variables
names(inde_var)
## [1] "road_class_M345_1000" "road_class_M345_100" "road_class_M345_25"
## [4] "road_class_M345_3000" "road_class_M345_300" "road_class_M345_5000"
## [7] "road_class_M345_500" "road_class_M345_50" "road_class_M345_800"
## [10] "long" "lat" "nightlight_800"
## [13] "nightlight_500" "nightlight_5000" "nightlight_3000"
## [16] "nightlight_1000" "elevation" "industry_1000"
## [19] "industry_100" "industry_25" "industry_3000"
## [22] "industry_300" "industry_5000" "industry_500"
## [25] "industry_50" "industry_800" "OMI_mean_filt"
## [28] "population_1000" "population_3000" "population_5000"
## [31] "road_class_1_1000" "road_class_1_100" "road_class_1_25"
## [34] "road_class_1_3000" "road_class_1_300" "road_class_1_5000"
## [37] "road_class_1_500" "road_class_1_50" "road_class_1_800"
## [40] "road_class_2_1000" "road_class_2_100" "road_class_2_25"
## [43] "road_class_2_3000" "road_class_2_300" "road_class_2_5000"
## [46] "road_class_2_500" "road_class_2_50" "road_class_2_800"
## [49] "Rsp" "temperature_2m_10" "temperature_2m_11"
## [52] "temperature_2m_12" "temperature_2m_1" "temperature_2m_2"
## [55] "temperature_2m_3" "temperature_2m_4" "temperature_2m_5"
## [58] "temperature_2m_6" "temperature_2m_7" "temperature_2m_8"
## [61] "temperature_2m_9" "trop_mean_filt" "wind_speed_10m_10"
## [64] "wind_speed_10m_11" "wind_speed_10m_12" "wind_speed_10m_1"
## [67] "wind_speed_10m_2" "wind_speed_10m_3" "wind_speed_10m_4"
## [70] "wind_speed_10m_5" "wind_speed_10m_6" "wind_speed_10m_7"
## [73] "wind_speed_10m_8" "wind_speed_10m_9" "value_mean"
## [76] "country" "count1"
The scatter plots between predictors and mean NO2, Germany and global datasets. loess: moving regression, non-parametric, each smoothed value is given by a weighted linear least squares regression over the span.
inde_var %>% filter(country=="DE")%>%ungroup%>% dplyr::select(matches("road_class_M345|nightlight|temperature_2m_7|value")) %>% scatterplot(y_name = "value_mean", fittingmethod = "loess")
inde_var %>% filter(country=="DE")%>%ungroup%>% dplyr::select(matches("road_class_2|value")) %>% scatterplot(y_name = "value_mean", fittingmethod = "gam")
inde_var %>%ungroup%>% dplyr::select(matches("road_class_M345|nightlight|temperature_2m_7|value")) %>% scatterplot(y_name = "value_mean", fittingmethod = "loess")
inde_var %>% ungroup%>% dplyr::select(matches("road_class_2|value")) %>% scatterplot(y_name = "value_mean", fittingmethod = "loess")
inde_var %>% ungroup%>% dplyr::select(matches("road_class_1|value")) %>% scatterplot(y_name = "value_mean", fittingmethod = "gam")
#inde_var %>% dplyr::select(matches("Tro|OMI|Rsp|night_value")) %>% scatterplot(y_name = "night_value", fittingmethod = "loess")
# why?
# can choose any variable to look at the scatterplot
#inde_var %>% dplyr::select(matches("ROAD_1|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "gam")
Discussion 1, try different scatterplot and discuss about the findings
If we use a single regression tree, the tree and prediction error will be different if shuffeling training and testing data. To reduce the variance, a method called “bagging” is developed, which agregate over (weak) voters. If the predictor variables are correlated, the reduction in variance is decreasing, Random Forest comes to resecue by ramdomly sampling variables.
XGBoost is surely popular, as it won the Kaggle (machine learning prediction) competition. It has a few features such as being scalable, but most importantly, it super-imposed a regularization to prevent model over-fitting. However, in my experience, though I often obtain the lowest RMSE or highest R2 with XGBoost, the spatial pattern it predicts look a lot worse than random forest, or simpler linear model with regularization – Lasso. It looks to predict lots of artefacts. We further compared various model prediction results with mobile sensor measurements (with a typical Dutch transporting tool “bakfiets”, which is a cargo-bike), and found XGBoost matches the least with the mobile sensor measurements! – No matter how I tune the model.
inde_var = inde_var%>%ungroup()%>%dplyr::select(-country)
Let’s come back to the model tunning, here I am showing to do this with the R package Caret, there are other packages,such as mlr3, but but Caret seems to be well-maintained, and is sufficient in most cases, and you can simply use ggplot on the caret::train object. All we need is to custom a tunning grid for tunning and control the resampling method.
Firstly, Random Forest, the most important hyperparameter are mtry - the number of variables sampled at each split, and min.node.size - the minimum number of observations at the leaf. The number of trees is also a hyperparameter, but it can be set as high as you like, as it will not cause model over-fitting as each tree is grown independently, which is different from boosting, which grows trees subsequently. Of course, you can also tune it.
[Try after the workshop as it takes quite a while; set the “eval =T”]
trl <- trainControl(method = "cv", number = 3) # control the resampling method
tgrid <- expand.grid(
.mtry = seq(7, 30, by = 3),
.splitrule = "variance",
.min.node.size = c(5, 10)
)
caret::train(value_mean ~ ., data = inde_var , method='ranger', trControl = trl, tuneGrid= tgrid) %>%ggplot()
#The final values used for the model were mtry = 25, splitrule = variance and min.node.size = 5.
In the same way, we can tune GBM [ Try after the workshop as it takes quite a while].
Tunning XGBoost is more complex as it has a lot more hyperparameters to tune: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
1 gamma[default=0][range: (0,Inf)] It controls regularization (or prevents overfitting). The optimal value of gamma depends on the data set and other parameter values. Higher the value, higher the regularization. Regularization means penalizing large coefficients which don’t improve the model’s performance. default = 0 means no regularization. Tune trick: Start with 0 and check CV error rate. If you see train error >> test error, bring gamma into action.
2. lambda and Alpha: similar to the Lasso Lambda, control the strength of regularization
3. nrounds[default=100] It controls the maximum number of iterations. For classification, it is similar to the number of trees to grow. Should be tuned using CV
4. eta[default=0.3][range: (0,1)] It controls the learning rate, i.e., the rate at which our model learns patterns in data. After every round, it shrinks the feature weights to reach the best optimum. Lower eta leads to slower computation. It must be supported by increase in nrounds. Typically, it lies between 0.01 - 0.3
5. max_depth[default=6][range: (0,Inf)] It controls the depth of the tree. Larger data sets require deep trees to learn the rules from data. Should be tuned using CV
xgboostgrid = expand.grid(nrounds = seq(300, 2000, by = 50), max_depth = 3:5, eta = seq(0.05, 0.2, by = 0.05),gamma = 1,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 0.7)
trainControl <- trainControl(method="cv", number=5, savePredictions = "final", allowParallel = T) #5 - folds
# train the model
train(value_mean~., data=inde_var, method="xgbTree", trControl=trainControl, tuneGrid =xgboostgrid)%>%ggplot()
If you train a single train, e can see the tree is stochastic. But we can look at the tree structure to get some intuition of the model structure.
for (i in 2:5)
{
set.seed(i)
ctree_LUR(inde_var, y_varname= c("value_mean"), training = training, test = test, grepstring = vastring)
}
Creates diverse set of trees because 1) trees are unstable w.r.t. changes in learning/training data (bagging) 2) randomly preselect mtry splitting variables in each split
ranger(value_mean~., data = inde_var)
## Ranger result
##
## Call:
## ranger(value_mean ~ ., data = inde_var)
##
## Type: Regression
## Number of trees: 500
## Sample size: 5401
## Number of independent variables: 75
## Mtry: 8
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 49.57375
## R squared (OOB): 0.7456237
Here I am showing the “gbm” package. The “dismo” package extends “gbm” with the deviviance calculated from a distribution that can be chosen by users. Note though the dismo is built on gbm functions, the hyperparameters are slightly different.
gbm1 = gbm(formula = value_mean~., data =inde_var, distribution = "gaussian", n.trees = 2000,
interaction.depth = 6, shrinkage = 0.01, bag.fraction = 0.5 )
gbm1
## gbm(formula = value_mean ~ ., distribution = "gaussian", data = inde_var,
## n.trees = 2000, interaction.depth = 6, shrinkage = 0.01,
## bag.fraction = 0.5)
## A gradient boosted model with gaussian loss function.
## 2000 iterations were performed.
## There were 75 predictors of which 75 had non-zero influence.
summary(gbm1)
## var rel.inf
## trop_mean_filt trop_mean_filt 32.016734927
## population_5000 population_5000 13.133285240
## population_3000 population_3000 8.571061123
## population_1000 population_1000 5.957151619
## long long 2.424777011
## road_class_2_25 road_class_2_25 1.873000603
## road_class_2_50 road_class_2_50 1.838589429
## count1 count1 1.508331588
## wind_speed_10m_3 wind_speed_10m_3 1.471829284
## road_class_2_100 road_class_2_100 1.376526682
## road_class_M345_3000 road_class_M345_3000 1.264089987
## road_class_1_50 road_class_1_50 1.260928765
## road_class_1_25 road_class_1_25 1.110781574
## road_class_M345_100 road_class_M345_100 0.941625654
## OMI_mean_filt OMI_mean_filt 0.923426286
## road_class_M345_1000 road_class_M345_1000 0.912365008
## road_class_M345_300 road_class_M345_300 0.897958609
## nightlight_500 nightlight_500 0.871125862
## road_class_1_300 road_class_1_300 0.858054848
## road_class_M345_25 road_class_M345_25 0.829406515
## Rsp Rsp 0.775062723
## road_class_M345_50 road_class_M345_50 0.758643865
## road_class_2_3000 road_class_2_3000 0.733065408
## road_class_1_5000 road_class_1_5000 0.729849445
## road_class_1_100 road_class_1_100 0.695343375
## road_class_M345_5000 road_class_M345_5000 0.679418994
## elevation elevation 0.669637508
## road_class_2_5000 road_class_2_5000 0.639759008
## road_class_M345_800 road_class_M345_800 0.627314390
## wind_speed_10m_2 wind_speed_10m_2 0.619818788
## temperature_2m_10 temperature_2m_10 0.609072562
## lat lat 0.565608441
## temperature_2m_1 temperature_2m_1 0.541776808
## temperature_2m_7 temperature_2m_7 0.526653533
## road_class_M345_500 road_class_M345_500 0.522454845
## wind_speed_10m_12 wind_speed_10m_12 0.488563934
## wind_speed_10m_1 wind_speed_10m_1 0.478766683
## road_class_1_500 road_class_1_500 0.456565128
## temperature_2m_5 temperature_2m_5 0.432207996
## industry_5000 industry_5000 0.424802504
## temperature_2m_12 temperature_2m_12 0.422786015
## road_class_1_3000 road_class_1_3000 0.405707108
## nightlight_5000 nightlight_5000 0.386611431
## nightlight_3000 nightlight_3000 0.368263399
## temperature_2m_6 temperature_2m_6 0.360916615
## temperature_2m_4 temperature_2m_4 0.352551575
## wind_speed_10m_4 wind_speed_10m_4 0.351821568
## wind_speed_10m_6 wind_speed_10m_6 0.339811802
## temperature_2m_3 temperature_2m_3 0.339799443
## temperature_2m_9 temperature_2m_9 0.334153610
## wind_speed_10m_5 wind_speed_10m_5 0.323722236
## temperature_2m_2 temperature_2m_2 0.317587531
## nightlight_1000 nightlight_1000 0.308780342
## wind_speed_10m_11 wind_speed_10m_11 0.306084285
## road_class_2_300 road_class_2_300 0.290062201
## temperature_2m_8 temperature_2m_8 0.271336088
## road_class_1_800 road_class_1_800 0.249461316
## industry_3000 industry_3000 0.245802757
## wind_speed_10m_8 wind_speed_10m_8 0.242670940
## wind_speed_10m_10 wind_speed_10m_10 0.223724095
## nightlight_800 nightlight_800 0.222162085
## road_class_1_1000 road_class_1_1000 0.196000467
## road_class_2_1000 road_class_2_1000 0.176900727
## temperature_2m_11 temperature_2m_11 0.167128883
## wind_speed_10m_7 wind_speed_10m_7 0.157561120
## road_class_2_500 road_class_2_500 0.148244783
## wind_speed_10m_9 wind_speed_10m_9 0.146966223
## road_class_2_800 road_class_2_800 0.097349454
## industry_1000 industry_1000 0.085039200
## industry_300 industry_300 0.058057090
## industry_800 industry_800 0.056610903
## industry_500 industry_500 0.014752765
## industry_100 industry_100 0.009351240
## industry_50 industry_50 0.003665431
## industry_25 industry_25 0.003118718
plot(gbm1, i.var = 2:3)
#plot(gbm1, i.var = 1)
#rf_residual <- pre_rf - rdf_test$NO2
y_varname= "value_mean"
x = inde_var%>%dplyr::select(-value_mean)
df_x = data.table(inde_var, keep.rownames = F)
df_y = inde_var$value_mean
formu = as.formula(paste(y_varname, "~.", sep = ""))
dfmatrix_x = sparse.model.matrix(formu, data = df_x)[, -1]
train_b = xgb.DMatrix(data = dfmatrix_x, label = df_y)
max_depth = 4
eta = 0.01
nthread = 4
nrounds = 800
Gamma = 2
bst <- xgboost(data = train_b, max_depth = max_depth, eta = eta, nthread = nthread, nrounds = nrounds, Gamma = Gamma, verbose = 1, print_every_n = 200, early_stopping_rounds = 50, maximize = F )
## [1] train-rmse:27.563074
## Will train until train_rmse hasn't improved in 50 rounds.
##
## [201] train-rmse:8.127597
## [401] train-rmse:6.285890
## [601] train-rmse:5.835217
## [800] train-rmse:5.604253
caretEnsemble is computational intensive. Can customize using the “emsemble.R”
#install.packages("caretEnsemble")
#library(caretEnsemble)
# Stacking Algorithms - Run multiple algos in one call.
#trainControl <- trainControl(method = "repeatedcv",
# number = 10,
# repeats = 2,
# savePredictions = TRUE,
# classProbs = TRUE)
#algorithmList <- c('rf', 'adaboost', 'earth', 'xgbDART', 'svmRadial')
#set.seed(100)
#models <- caretList(value_mean ~ ., data = inde_var , trControl = trainControl, methodList = algorithmList)
#results <- resamples(models)
#summary(results)
Variable importance and partial dependence plots are commonly used to interpret models.
pre_mat = subset_grep(inde_var , grepstring = paste0("value_mean|",vastring))
rf = ranger(value_mean~ ., data = na.omit( pre_mat), mtry = 25, num.trees = 1000, importance = "permutation")
rf
## Ranger result
##
## Call:
## ranger(value_mean ~ ., data = na.omit(pre_mat), mtry = 25, num.trees = 1000, importance = "permutation")
##
## Type: Regression
## Number of trees: 1000
## Sample size: 5401
## Number of independent variables: 70
## Mtry: 25
## Target node size: 5
## Variable importance mode: permutation
## Splitrule: variance
## OOB prediction error (MSE): 50.09529
## R squared (OOB): 0.7429476
# ranger method
sort(importance(rf), decreasing = T)
## trop_mean_filt population_5000 population_3000
## 87.572685329 32.305847482 26.424867012
## population_1000 wind_speed_10m_3 road_class_2_5000
## 16.497501545 11.579361023 11.569234721
## wind_speed_10m_2 temperature_2m_5 temperature_2m_7
## 9.677357100 5.437506395 4.807025962
## wind_speed_10m_1 wind_speed_10m_5 temperature_2m_2
## 4.661552208 4.545673438 3.989624971
## road_class_2_50 road_class_2_25 wind_speed_10m_9
## 3.974988113 3.786308301 3.750904804
## temperature_2m_1 road_class_1_5000 road_class_1_3000
## 3.595540685 3.556345295 3.448923834
## wind_speed_10m_4 wind_speed_10m_8 road_class_2_100
## 3.434108119 3.429254266 3.356673009
## temperature_2m_12 temperature_2m_9 temperature_2m_10
## 3.309390472 3.279753558 3.239252491
## wind_speed_10m_12 road_class_2_3000 nightlight_3000
## 3.219914602 3.129851680 2.992405883
## wind_speed_10m_10 wind_speed_10m_6 temperature_2m_4
## 2.966142423 2.942553873 2.900021673
## nightlight_500 nightlight_5000 temperature_2m_6
## 2.820524880 2.814736987 2.805224659
## temperature_2m_3 temperature_2m_8 nightlight_800
## 2.729032306 2.665641877 2.585413702
## temperature_2m_11 nightlight_1000 wind_speed_10m_11
## 2.442832069 2.339505347 2.317944240
## road_class_M345_5000 road_class_1_50 road_class_M345_300
## 2.105370178 2.081714570 2.070152481
## road_class_M345_100 road_class_M345_3000 wind_speed_10m_7
## 1.802213839 1.799516271 1.702835872
## elevation road_class_2_300 road_class_M345_500
## 1.527847149 1.402257916 1.371340134
## road_class_1_100 road_class_M345_800 road_class_1_25
## 1.369627774 1.358332724 1.318970500
## road_class_M345_50 road_class_M345_1000 road_class_1_300
## 1.292768021 1.249324390 1.229975357
## road_class_M345_25 industry_5000 industry_3000
## 1.227033988 1.081789153 1.012055112
## road_class_1_500 road_class_2_500 road_class_2_1000
## 1.009913109 0.910626462 0.676039536
## road_class_1_800 road_class_1_1000 road_class_2_800
## 0.636319022 0.514185508 0.503674516
## industry_1000 industry_800 industry_500
## 0.320207113 0.149672728 0.063685866
## industry_300 industry_100 industry_25
## 0.050756195 0.006801649 0.002092119
## industry_50
## -0.004064185
[Try after workshop as it takes a while: Variable importance and partial dependence plots can be calculated and presented using vi and sparklines packages, which include more matrices and presentation functionalities. ]
set.seed(2)
vip::list_metrics()
DF_P_r2 = vi(rf, method = "permute", target = "value_mean", metric = "r2" ) # very clear what method is used and what is the target
DF_P_rmse = vi(rf, method = "permute", target = "value_mean", metric = "rmse")
vip (DF_P_r2)
vip (DF_P_rmse)
a=add_sparklines(DF_P_r2, fit = rf)
saveWidget(a, file="sparkline.html")
Use a subset of variables in RF to inspect partial denpendence.
pre_mat_s = inde_var%>%na.omit%>%ungroup() %>% dplyr::select(value_mean, road_class_2_50, nightlight_500, population_5000, road_class_M345_1000)
lm_s = lm(value_mean~., data = pre_mat_s)
rf_s = ranger(value_mean~ ., data = pre_mat_s, num.trees = 1000, importance = "permutation")
rf_s
## Ranger result
##
## Call:
## ranger(value_mean ~ ., data = pre_mat_s, num.trees = 1000, importance = "permutation")
##
## Type: Regression
## Number of trees: 1000
## Sample size: 5401
## Number of independent variables: 4
## Mtry: 2
## Target node size: 5
## Variable importance mode: permutation
## Splitrule: variance
## OOB prediction error (MSE): 100.1319
## R squared (OOB): 0.486196
Partial dependence is the marginal effect one or two features have on the predicted outcome of a machine learning model. It is the integration of all the other predictors given each value of the variable under inspection. Partial dependence plot is calculated based on the assumption that the correlation between the variable to be inspected and variables to be integrated to be low. For example, if we are interested in the effect of population within 1000 m buffer, but we integrate over population within 3000 m buffer, then high population region as is shown with the 1000 m buffer data may include very low popolation within 3000 m buffer, which is very unlikely.
pre_mat_predictor = pre_mat_s%>%dplyr::select(-value_mean)
ggpairs(pre_mat_predictor)
The partial dependence plots of linear and random forest regression
p_lm = partial(lm_s, "road_class_M345_1000",plot = TRUE, rug = TRUE)
plot(p_lm) # Linear regression
p2 = partial(rf_s, "road_class_M345_1000",plot = TRUE, rug = TRUE)
plot(p2) # random forest
We can also inspect 2D and 3-D PDP plots for more in-depth insights of the joint effects from variables. [Try afterwork shop as it will take a wile]
#slow
pd <- partial(rf_s, pred.var = c("population_3000", "road_class_M345_1000" ))
# Default PDP
pd1 = plotPartial(pd)
# Add contour lines and use a different color palette
rwb <- colorRampPalette(c("red", "white", "blue"))
pdp2 = plotPartial(pd, contour = TRUE, col.regions = rwb)
#3-D surface
pdp3 <- plotPartial(pd, levelplot = F, zlab = "road_class_2_50", colorkey = T,
screen = list(z = -20, x = -60) )
p3 = partial(rf_s, "road_class_2_50", plot = TRUE, rug = TRUE)
p1 = partial(rf_s, "population_3000", plot = TRUE, rug = TRUE)
grid.arrange(p1, p2, p3, pd1, pdp2, ncol = 2)